This notebook is Tom’s analysis from raw data
source("../CamProt_R/Utility.R")
library(tidyverse)
[30m── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mtibble [30m 2.0.0 [32m✔[30m [34mpurrr [30m 0.2.5
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mstringr[30m 1.3.1
[32m✔[30m [34mtibble [30m 2.0.0 [32m✔[30m [34mforcats[30m 0.3.0[39m
[30m── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mMSnbase[30m::[32mcombine()[30m masks [34mBiobase[30m::combine(), [34mBiocGenerics[30m::combine(), [34mdplyr[30m::combine()
[31m✖[30m [34mS4Vectors[30m::[32mexpand()[30m masks [34mtidyr[30m::expand()
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mS4Vectors[30m::[32mfirst()[30m masks [34mdplyr[30m::first()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mBiocGenerics[30m::[32mPosition()[30m masks [34mggplot2[30m::Position(), [34mbase[30m::Position()
[31m✖[30m [34mpurrr[30m::[32mreduce()[30m masks [34mMSnbase[30m::reduce()
[31m✖[30m [34mS4Vectors[30m::[32mrename()[30m masks [34mdplyr[30m::rename()[39m
infile <- "Input/OOPS_qLOPIT_LabelFree_PeptideGroups_parsed.txt"
samples_inf <- "Input/samples.tsv"
peptides_df <- parse_features(infile, silac=FALSE, TMT=FALSE, level="peptide", filter_crap=TRUE)
Tally of features at each stage:
29067 All features with a PSM
These features are associated with 2597 master proteins
27490 Excluding features without a master protein
These features are associated with 2596 master proteins
26424 Excluding features without a unique master protein
These features are associated with 2311 master proteins
26254 Excluding features matching a cRAP protein
These features are associated with 2301 master proteins
25987 Excluding features associated with a cRAP protein
These features are associated with 2283 master proteins
colnames(peptides_df)
[1] "Checked" "Confidence" "Sequence"
[4] "Modifications" "Qvality.PEP" "Qvality.q.value"
[7] "Number.of.Protein.Groups" "Number.of.Proteins" "Number.of.PSMs"
[10] "Master.Protein.Accessions" "Number.of.Missed.Cleavages" "Theo.MHplus.in.Da"
[13] "Found.in.Sample.in.S1.F1.Sample" "Found.in.Sample.in.S2.F2.Sample" "Found.in.Sample.in.S3.F3.Sample"
[16] "Found.in.Sample.in.S4.F4.Sample" "Found.in.Sample.in.S6.F6.Sample" "Found.in.Sample.in.S5.F5.Sample"
[19] "Found.in.Sample.in.S7.F7.Sample" "Found.in.Sample.in.S8.F8.Sample" "Found.in.Sample.in.S9.F9.Sample"
[22] "Found.in.Sample.in.S10.F10.Sample" "Found.in.Sample.in.S11.F11.Sample" "Found.in.Sample.in.S12.F12.Sample"
[25] "Found.in.Sample.in.S13.F13.Sample" "Found.in.Sample.in.S14.F14.Sample" "Found.in.Sample.in.S15.F15.Sample"
[28] "Found.in.Sample.in.S16.F16.Sample" "Found.in.Sample.in.S17.F17.Sample" "Found.in.Sample.in.S18.F18.Sample"
[31] "Found.in.Sample.in.S19.F19.Sample" "Found.in.Sample.in.S20.F20.Sample" "Area.F1.Sample"
[34] "Area.F2.Sample" "Area.F3.Sample" "Area.F4.Sample"
[37] "Area.F6.Sample" "Area.F5.Sample" "Area.F7.Sample"
[40] "Area.F8.Sample" "Area.F9.Sample" "Area.F10.Sample"
[43] "Area.F11.Sample" "Area.F12.Sample" "Area.F13.Sample"
[46] "Area.F14.Sample" "Area.F15.Sample" "Area.F16.Sample"
[49] "Area.F17.Sample" "Area.F18.Sample" "Area.F19.Sample"
[52] "Area.F20.Sample" "XCorr.Sequest.HT" "Confidence.Sequest.HT"
[55] "Percolator.q.Value.Sequest.HT" "Percolator.PEP.Sequest.HT" "master_protein"
[58] "protein_length" "protein_description" "peptide_start"
[61] "peptide_end" "crap_protein" "associated_crap_protein"
[64] "unique" "filename"
peptides_quant <- makeMSNSet(obj=peptides_df, samples_inf=samples_inf, ab_col_ix=2, level="peptide", quant_name="Area")
tallies for missing data (# samples with missing)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
16 35 98 99 100 119 141 144 224 278 327 386 517 684 930 1238 1696 2517 3981 8158 4299
4299 peptides do not have any quantification values
So lots and lots of missing values! most peptides are quantified in only a very minor subset of fractions (<5/20). This is no suprise since we’re dealing with LFQ and subcellular fractionation here.
plotMissing(peptides_quant)
Out of 21688 total features, 21672 (99.926%) have missing values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
35 98 99 100 119 141 144 224 278 327 386 517 684 930 1238 1696 2517 3981 8158
And 21637 features have more than one missing value
What about if we subset to GO annotated RBPs?
human_go <- readRDS("./Input/h_sapiens_go_full.rds")
GO_RBPs <- human_go %>% filter(GO.ID=="GO:0003723") %>% pull(UNIPROTKB)
peptides_quant[fData(peptides_quant)$master_protein %in% GO_RBPs,] %>% plotMissing()
Out of 11023 total features, 11012 (99.9%) have missing values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
26 79 76 78 87 100 106 162 200 228 247 327 397 531 671 894 1253 1844 3706
And 10986 features have more than one missing value
Ok, so this doesn’t make any difference, 99.9% have a missing value!
Let’s aggregate to the unique peptide sequence level (ignorning variable modifications) and then see whether that reduces our problem
peptides_no_mod_quant <- agg_to_peptides(peptides_quant)
Your data contains missing values. Please read the relevant section in the combineFeatures manual page for
details the effects of missing values on data aggregation.
Nope, not really! We still have 19304 features (previously 21688) and 99.9% have missing values!
plotMissing(peptides_no_mod_quant)
Out of 19304 total features, 19288 (99.917%) have missing values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
37 109 102 94 133 146 149 221 298 344 396 526 691 907 1204 1649 2389 3527 6366
And 19251 features have more than one missing value
OK, so we’re going to have to impute. Note that the missing valuesa are particularly present in the first 2 fractions and across fractions 3-7. After that we see fewer missing values. Also, remember that we have yet to identify any definite set of RNAs from the initial fractions (1-7ish). For this reason, we’ll remove these first 5 fractions for now and leave fraction 6 & 7 as these are useful to separate light membranes
plot(colSums(is.na(exprs(peptides_no_mod_quant))))
#peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,8:20]
peptides_no_mod_quant_no_lm <- peptides_no_mod_quant[,6:20]
plotMissing(peptides_no_mod_quant_no_lm)
Out of 19304 total features, 19139 (99.145%) have missing values
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
148 196 227 277 304 338 485 603 808 1070 1564 2371 3506 6476 766
And 18991 features have more than one missing value
Ok, so now we’re down to just 99.1% missing :p but at least we have some more peptides now with relatively few (<=4 missing values)
If we focus on those peptides with 4-9 missing values, we can see that many are missing in a block of sequential fractions. Arguably, these are true missing values, e.g not at random.
missing_n <- rowSums(is.na(exprs(peptides_no_mod_quant_no_lm)))
peptides_no_mod_quant_no_lm[(missing_n>=4 & missing_n<=9),] %>% plotMissing()
Out of 2815 total features, 2815 (100%) have missing values
4 5 6 7 8 9
277 304 338 485 603 808
And 2815 features have more than one missing value
We can identify the missing values which are in a sequential block of >=5 fractions in a row and replace these with zero
First, let’s make a function to identify rows of values where the missing values are not random, e.g 4 or more in a row
test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1)
is_not_at_random <- function(x){
with(rle(is.na(x)), sum(lengths[values] >= 4))
}
is_not_at_random(test_values_1)
[1] 0
is_not_at_random(test_values_2)
[1] 1
is_not_at_random(test_values_3)
[1] 1
Now, let’s check this identifies the peptides which are not missing at random. First, we’ll remove those without at least 4 quantification values.
peptides_no_mod_quant_4 <- peptides_no_mod_quant_no_lm[missing_n<=(ncol(peptides_no_mod_quant_no_lm)-4),]
missing_not_at_random <- apply(exprs(peptides_no_mod_quant_4), 1, is_not_at_random)
peptides_no_mod_quant_4[missing_not_at_random==1,] %>% plotMissing()
Out of 3846 total features, 3846 (100%) have missing values
4 5 6 7 8 9 10 11
16 66 127 290 421 682 956 1288
And 3846 features have more than one missing value
peptides_no_mod_quant_4[missing_not_at_random==2,] %>% plotMissing()
Out of 376 total features, 376 (100%) have missing values
8 9 10 11
4 36 74 262
And 376 features have more than one missing value
Now, let’s extend the function to replace the blocks of missing values with zero
test_values_1 <- c(1,1,NA,1,NA,NA,1,1,1,1)
test_values_2 <- c(1,1,NA,NA,NA,NA,NA,1,1,1)
test_values_3 <- c(NA,NA,NA,NA,1,1,1,1,1, NA)
rle(is.na(test_values_1))$values
[1] FALSE TRUE FALSE TRUE FALSE
rle(is.na(test_values_1))$lengths
[1] 2 1 1 2 4
replace_missing_not_at_random <- function(x){
missing_rle <- rle(is.na(x))
sequential_blocks <- missing_rle$values
sequential_blocks[missing_rle$lengths<4] <- FALSE
replace_with_zero <- rep(sequential_blocks, missing_rle$lengths)
out <- x
out[replace_with_zero]<-0
return(out)
}
replace_missing_not_at_random(test_values_1)
[1] 1 1 NA 1 NA NA 1 1 1 1
replace_missing_not_at_random(test_values_2)
[1] 1 1 0 0 0 0 0 1 1 1
replace_missing_not_at_random(test_values_3)
[1] 0 0 0 0 1 1 1 1 1 NA
Below we impute a maximum of 10 missing values in sequential blocks with zeros
missing_n2 <- rowSums(is.na(exprs(peptides_no_mod_quant_4)))
peptides_no_mod_quant_4_mnar_zero <- peptides_no_mod_quant_4[missing_n2<=10,]
exprs(peptides_no_mod_quant_4_mnar_zero) <- exprs(peptides_no_mod_quant_4_mnar_zero) %>%
apply(1, replace_missing_not_at_random) %>% t()
Re-check the missing values
plotMissing(peptides_no_mod_quant_4)
Out of 6185 total features, 6020 (97.332%) have missing values
1 2 3 4 5 6 7 8 9 10 11
148 196 227 277 304 338 485 603 808 1070 1564
And 5872 features have more than one missing value
plotMissing(peptides_no_mod_quant_4_mnar_zero)
Out of 4621 total features, 4009 (86.756%) have missing values
1 2 3 4 5 6 7 8 9 10
668 789 734 593 429 293 195 178 90 40
And 3341 features have more than one missing value
peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero
imputed_zeros <- rowSums(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)==0, na.rm=TRUE)
missing_n3 <- rowSums(is.na(exprs(peptides_no_mod_quant_4_mnar_zero_mar_knn)))
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation <- imputed_zeros>0
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation_n <- imputed_zeros
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing <- missing_n3>0
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing_n <- missing_n3
peptides_no_mod_quant_4_mnar_zero_mar_knn <- peptides_no_mod_quant_4_mnar_zero_mar_knn[missing_n3<=3,]
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))
FALSE TRUE
736 2067
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing))
FALSE TRUE
612 2191
print(table(fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$zero_imputation))
FALSE TRUE
FALSE 165 447
TRUE 571 1620
dim(peptides_no_mod_quant_4_mnar_zero)
[1] 4621 15
dim(peptides_no_mod_quant_4_mnar_zero_mar_knn)
[1] 2803 15
peptides_no_mod_quant_4_mnar_zero_mar_knn <- suppressMessages(impute(peptides_no_mod_quant_4_mnar_zero_mar_knn, "knn", k = 10))
Cluster size 2803 broken into 2633 170
Cluster size 2633 broken into 2177 456
Cluster size 2177 broken into 252 1925
Done cluster 252
Cluster size 1925 broken into 1390 535
Done cluster 1390
Done cluster 535
Done cluster 1925
Done cluster 2177
Done cluster 456
Done cluster 2633
Done cluster 170
p <- plotLabelQuant(peptides_no_mod_quant_no_lm, log=TRUE)
p <- plotLabelQuant(peptides_no_mod_quant_4_mnar_zero_mar_knn, log=TRUE)
p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)
p <- peptides_no_mod_quant_4_mnar_zero_mar_knn[
!fData(peptides_no_mod_quant_4_mnar_zero_mar_knn)$missing,] %>% plotLabelQuant(log=TRUE)
protein_quant <- agg_to_protein(peptides_no_mod_quant_4_mnar_zero_mar_knn)
print(dim(protein_quant))
[1] 733 15
source("~/git_repos/CamProt_R/LOPIT.R")
markers_df <- read.delim("./Input//markers_9B_hyperLOPIT_vs_DC.csv", sep=",", header=FALSE, stringsAsFactors=FALSE)[,1:2]
markers_df$V2 <- recode(markers_df$V2, "RIBOSOME 40S"="RIBOSOME", "RIBOSOME 60S"="RIBOSOME")
markers_proteins <- markers_df$V2
names(markers_proteins) <- markers_df$V1
protein_quant_am <- addMarkers(normalise(protein_quant, "sum"), markers_proteins)
Markers in data: 123 out of 733
organelleMarkers
CYTOSOL ER GOLGI LYSOSOME MITOCHONDRIA NUCLEUS
8 21 4 2 5 16
NUCLEUS-CHROMATIN PEROXISOME PM PROTEASOME RIBOSOME unknown
6 1 18 3 39 610
p <- plotHexbin(protein_quant_am, "markers")
print(p)
marker_classes <- getMarkerClasses(protein_quant_am, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2_inc_glycoproteins.png")
Saving 7.29 x 4.5 in image
p <- plotPCA(protein_quant_am, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
glycoproteins <- read.delim("./Input/glycoproteins.tsv")$protein
Copied from Manasa’s oopsFinal.Rmd notebook
# We will add a bit more information to the fData file
# 1. List of known RBPs across cell lines in the XRNAX paper (Table S2)
xrnax = read.delim("./Input/xrnax-genelist.txt",sep="\t",header=T)
xrnax.rbps = xrnax %>%
dplyr::filter(!is.na(MCF7.RBP) | !is.na(HEK293.RBP) | !is.na(HeLa.RBP)) %>%
dplyr::select(Uniprot.ID:Protein.name)
rownames(xrnax.rbps) = xrnax.rbps$Uniprot.ID
print(length(rownames(xrnax.rbps)))
[1] 1753
# Check how many are common to the cell lines in the XRNAX paper
xrnax %>%
dplyr::select(MCF7.RBP:ihRBP) %>%
apply(2, table,useNA="always")
MCF7.RBP HEK293.RBP HeLa.RBP ihRBP
non-poly(A) 617 698 565 775
poly(A) 590 659 674 978
<NA> 1276 1126 1244 730
# 2. List of RBPs from SILAC experiments using OOPS after wash step2 (Table S1)
oops = read.delim("./Input/oops-genelist.txt",sep="\t",header=T)
oops.rbps = oops %>%
dplyr::filter(CL_NC_Ratio >= 1.0) %>%
dplyr::select(master_protein, RBP_glyco)
oops_rbps <- unique(oops.rbps$master_protein)
print(length(oops_rbps))
[1] 405
protein_quant_am_no_glyco <- protein_quant_am[!rownames(protein_quant_am) %in% glycoproteins,]
fData(protein_quant_am_no_glyco)$oops <- rownames(protein_quant_am_no_glyco) %in% oops_rbps
fData(protein_quant_am_no_glyco)$xrnax <- rownames(protein_quant_am_no_glyco) %in% rownames(xrnax.rbps)
fData(protein_quant_am_no_glyco)$go_rbp <- rownames(protein_quant_am_no_glyco) %in% GO_RBPs
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax))
FALSE TRUE
FALSE 176 198
TRUE 21 239
print(table(fData(protein_quant_am_no_glyco)$oops, fData(protein_quant_am_no_glyco)$xrnax,
fData(protein_quant_am_no_glyco)$go_rbp))
, , = FALSE
FALSE TRUE
FALSE 149 102
TRUE 16 31
, , = TRUE
FALSE TRUE
FALSE 27 96
TRUE 5 208
print(dim(protein_quant_am))
[1] 733 15
print(dim(protein_quant_am_no_glyco))
[1] 634 15
marker_classes <- getMarkerClasses(protein_quant_am_no_glyco, "markers")
m_colours <- getColours(marker_classes)
p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2.png")
Saving 7.29 x 4.5 in image
p <- plotPCA(protein_quant_am_no_glyco, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_3_4.png")
Saving 7.29 x 4.5 in image
protein_quant_am_no_glyco_yes_rbp <- protein_quant_am_no_glyco[
(fData(protein_quant_am_no_glyco)$oops |
fData(protein_quant_am_no_glyco)$xrnax |
fData(protein_quant_am_no_glyco)$go_rbp),]
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order)
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps.png")
Saving 7.29 x 4.5 in image
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, dims=c(3,4))
the condition has length > 1 and only the first element will be used
print(p)
$p
ggsave("./Output/plots/pca_3_4_no_glyco_only_rbps.png")
Saving 7.29 x 4.5 in image
translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=translocon)
the condition has length > 1 and only the first element will be used
print(p)
$p
$p_foi
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_translocon.png")
Saving 7.29 x 4.5 in image
print(dim(protein_quant_am_no_glyco_yes_rbp))
[1] 485 15
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "markers", m_colours=m_colours, re_order_markers=TRUE, marker_levels=organelle_order, foi=para)
the condition has length > 1 and only the first element will be used
print(p)
$p
$p_foi
ggsave("./Output/plots/pca_1_2_no_glyco_only_rbps_paraspeckles.png")
Saving 7.29 x 4.5 in image
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes, organelle_order=organelle_order) +
facet_wrap(~markers) +
scale_colour_manual(values=m_colours, guide=FALSE) +
theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)
ggsave("./Output/plots/marker_profiles.png")
Saving 10 x 10 in image
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "markers", keep_markers=marker_classes,
organelle_order=organelle_order, unknown=TRUE) +
facet_grid(zero_imputation_n~missing_n) +
scale_colour_manual(values=m_colours, guide=FALSE) +
theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)
ggsave("./Output/plots/marker_profiles_imputation.png")
Saving 10 x 10 in image
Now we make a new set of markers designed to highlight functional groups of RBPs more usefully. First we need to define new sets of GO annotation proteins, where each marker belongs to only one group
translocon <- human_go %>% filter(GO.ID=='GO:0071256') %>% pull(UNIPROTKB)
para <- human_go %>% filter(GO.ID=='GO:0042382') %>% pull(UNIPROTKB)
mrna_splicing <- human_go %>% filter(GO.ID=='GO:0000398') %>% pull(UNIPROTKB)
translation_init <- human_go %>% filter(GO.ID=='GO:0006413') %>% pull(UNIPROTKB)
translation_init <- setdiff(translation_init, names(markers_proteins)[markers_proteins=="RIBOSOME"])
cell_cell_adhesion <- human_go %>% filter(GO.ID=='GO:0098609') %>% pull(UNIPROTKB)
cytoskeleton <- human_go %>% filter(GO.ID=='GO:0005856') %>% pull(UNIPROTKB)
motor_activity <- human_go %>% filter(GO.ID=='GO:0003774') %>% pull(UNIPROTKB)
er_stress_response <- human_go %>% filter(GO.ID=='GO:0030968') %>% pull(UNIPROTKB)
nuclear_pore_channel <- human_go %>% filter(GO.ID=='GO:0044613') %>% pull(UNIPROTKB)
nuclear_pore_basket <- human_go %>% filter(GO.ID=='GO:0044615') %>% pull(UNIPROTKB)
tRNA_AA <- human_go %>% filter(GO.ID=='GO:0004812') %>% pull(UNIPROTKB)
#mrna_splicing <- setdiff(mrna_splicing, c(para, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity))
#translation_init <- setdiff(translation_init, c(para, cell_cell_adhesion, cytoskeleton, motor_activity))
#cytoskeleton <- setdiff(cytoskeleton, c(para, cell_cell_adhesion, motor_activity))
#cell_cell_adhesion <- setdiff(cell_cell_adhesion, c(para, motor_activity))
#all_markers <- c(mrna_splicing, para, translocon, translation_init, cell_cell_adhesion, cytoskeleton, motor_activity)
#print(length(all_markers)==length(unique(all_markers)))
mrna_splicing <- setdiff(mrna_splicing, tRNA_AA)
all_markers <- c(mrna_splicing, tRNA_AA)
print(length(all_markers)==length(unique(all_markers)))
[1] TRUE
rbps_markers <- markers_proteins
localisations_to_remove <- c("PEROXISOME", "PROTEASOME", "GOLGI", "LYSOSOME")
rbps_markers <- rbps_markers[!rbps_markers %in% localisations_to_remove]
rbps_markers[rbps_markers =="NUCLEUS-CHROMATIN"] <- "NUCLEUS"
print(table(rbps_markers))
rbps_markers
CYTOSOL ER MITOCHONDRIA NUCLEUS PM RIBOSOME
96 118 197 171 295 72
new_markers <- c(#rep("PARASPECKLES", length(para)),
rep("mRNA splicing", length(mrna_splicing)),
rep("Aminoacyl-tRNA ligase", length(tRNA_AA)),
"PARP1"#,
#rep("TRANSLATION INITITAION", length(translation_init)),
#rep("CELL-CELL ADHESION", length(cell_cell_adhesion)),
#rep("MOTOR", length(motor_activity))#,
#rep("CYTOSKELETON", length(cytoskeleton))
)
names(new_markers) <- c(#para,
mrna_splicing,
tRNA_AA,
"P09874"#,
#translation_init,
#cell_cell_adhesion,
#motor_activity#,
#cytoskeleton
)
print(table(names(new_markers))[table(names(new_markers))>1])
named integer(0)
rbps_markers <- rbps_markers[!names(rbps_markers) %in% names(new_markers)]
combined_markers <- c(rbps_markers, new_markers)
print(table(combined_markers))
combined_markers
Aminoacyl-tRNA ligase CYTOSOL ER MITOCHONDRIA mRNA splicing
42 93 118 193 321
NUCLEUS PARP1 PM RIBOSOME
149 1 295 72
print(table(names(combined_markers))[table(names(combined_markers))>1])
named integer(0)
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- NULL
protein_quant_am_no_glyco_yes_rbp <- addMarkers(protein_quant_am_no_glyco_yes_rbp, combined_markers, "new_markers")
Markers in data: 168 out of 485
organelleMarkers
Aminoacyl-tRNA ligase CYTOSOL ER MITOCHONDRIA mRNA splicing
13 5 7 2 86
NUCLEUS PARP1 PM RIBOSOME unknown
12 1 3 39 317
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers <- recode(
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers, "NUCLEUS"="Nucleus", "RIBOSOME"="Ribosome", "CYTOSOL"="Cytosol",
"MITOCHONDRIA"="Mitochondria")
print(table(fData(protein_quant_am_no_glyco_yes_rbp)$new_markers))
Aminoacyl-tRNA ligase Cytosol ER Mitochondria mRNA splicing
13 5 7 2 86
Nucleus PARP1 PM Ribosome unknown
12 1 3 39 317
After adding these new RBP markers, we only have 1 PM and 2 Mt proteins remaining. Let’s remove the PM protein
fData(protein_quant_am_no_glyco_yes_rbp)$new_markers[fData(protein_quant_am_no_glyco_yes_rbp)$new_markers=="PM"] <- "unknown"
new_markers_levels <- getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_channel)
the condition has length > 1 and only the first element will be used
print(p)
$p
$p_foi
p <- plotPCA(protein_quant_am_no_glyco_yes_rbp, "new_markers", re_order_markers=TRUE, marker_levels=new_markers_levels,
m_colours=getClassColours()[c(1:5, 7, 10:20)], foi=nuclear_pore_basket)
the condition has length > 1 and only the first element will be used
print(p)
$p
$p_foi
getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers")
[1] "Aminoacyl-tRNA ligase" "Cytosol" "ER" "Mitochondria"
[5] "mRNA splicing" "Nucleus" "PARP1" "Ribosome"
set.seed(1)
proj <- make_proj("t-SNE", protein_quant_am_no_glyco_yes_rbp, "new_markers")
library(Hmisc)
Loading required package: lattice
Loading required package: survival
Attaching package: ‘survival’
The following object is masked from ‘package:robustbase’:
heart
Loading required package: Formula
Attaching package: ‘Hmisc’
The following object is masked from ‘package:AnnotationDbi’:
contents
The following objects are masked from ‘package:MSnbase’:
impute, naplot
The following object is masked from ‘package:Biobase’:
contents
The following objects are masked from ‘package:dplyr’:
src, summarize
The following objects are masked from ‘package:base’:
format.pval, units
proj_df <- proj$PCA_df
marker_levels <- setdiff(new_markers_levels, "unknown")
marker_levels <- marker_levels[c(2,3,4,6,8,1,5,7)]
#m_colours <- getStockcol()[c(1,7,4,3,2,5)]
m_colours <- c(cbPalette[c(6,3,2,4,8,7,5)], "grey20")
proj_df$markers <- factor(proj_df$new_markers, levels=marker_levels)
print(table(is.na(proj_df$markers)))
FALSE TRUE
165 320
proj_df$unknown <- proj_df$new_markers=="unknown"
p <- ggplot(proj_df, aes(X, Y, colour=markers)) +
geom_point(aes(X, Y, colour=markers, alpha=unknown), size=2) +
scale_alpha_manual(values=c(1,0.2), guide=F) +
my_theme +
scale_colour_manual(values=m_colours, na.value="grey60", name="", breaks=c(marker_levels)) +
guides(colour = guide_legend(override.aes = list(alpha = 1, size=2))) +
xlab("Dimension 1") + ylab("Dimension 2")
print(p)
ggsave("./Output/tSNE_RBP_markers.png")
Saving 7.29 x 4.5 in image
write.table(proj_df, "./Output/tSNE_projections.tsv", sep="\t", quote=FALSE, row.name=FALSE)
p <- PlotMarkerProfiles(protein_quant_am_no_glyco_yes_rbp, "new_markers",
keep_markers=getMarkerClasses(protein_quant_am_no_glyco_yes_rbp, "new_markers"), organelle_order=new_markers_levels) +
facet_wrap(~markers) +
scale_colour_manual(values=getClassColours()[c(1:5, 7, 10:20)], guide=FALSE) +
theme(legend.position="none")
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
print(p)
protein_info <- read.delim("Input/Aggregated-proteins-2187-with-uniprot.tab") %>%
dplyr::select(Entry, gene_name=Gene.names...primary..)
total.prot = readRDS("./Input/prot_res_20_fractions_imputed_markers.rds")
colnames(total.prot)[12] <- "0.948"
colnames(total.prot) <- c(seq(1,20,2), seq(2,20,2))
total.prot <- total.prot[,order(as.numeric(as.character(colnames(total.prot))))]
total.prot <- total.prot[,5:19]
total.prot = normalise(total.prot,"sum")
rbp.prot <- protein_quant_am_no_glyco_yes_rbp
colnames(rbp.prot) <- 5:19
print(dim(total.prot))
[1] 5610 15
print(dim(rbp.prot))
[1] 485 15
print(colnames(total.prot))
[1] "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19"
print(colnames(rbp.prot))
[1] "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19"
plotCombinedProfiles <- function(foi){
foi_in_total <- intersect(foi, rownames(total.prot))
foi_in_rbp <- intersect(foi, rownames(rbp.prot))
foi_in_both <- intersect(foi_in_rbp, foi_in_total)
print(foi_in_both)
if(length(foi_in_both)==0){
return(NA)
}
total_exprs_df <- exprs(total.prot[foi_in_both,])
total_exprs_df <- melt(total_exprs_df)
total_exprs_df$type = "All protein"
rbp_exprs_df <- exprs(rbp.prot[foi_in_both,])
rbp_exprs_df <- melt(rbp_exprs_df)
rbp_exprs_df$type = "RNA-bound"
#print(head(total_exprs_df))
#print(head(rbp_exprs_df))
#print(head(rbind(rbp_exprs_df, total_exprs_df)))
p <- rbind(rbp_exprs_df, total_exprs_df) %>%
merge(protein_info, by.x="Var1", by.y="Entry") %>%
ggplot(aes(Var2, value, colour=type, group=type)) +
my_theme + geom_line() +
theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
xlab("Fraction") + ylab("Sumn normalised\nabundance") +
scale_colour_discrete(name="", na.value="grey")
if(length(foi_in_both)>1){
p <- p + facet_wrap(~gene_name)
}
print(p)
p2 <- p <- rbind(rbp_exprs_df, total_exprs_df) %>%
merge(protein_info, by.x="Var1", by.y="Entry") %>%
ggplot(aes(Var2, value, colour=type, group=interaction(type, gene_name))) +
my_theme + geom_line() +
theme(aspect.ratio=1, axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
xlab("Fraction") + ylab("Sumn normalised\nabundance") +
scale_colour_discrete(name="", na.value="grey")
print(p2)
invisible(list("p"=p, "p2"=p2))
}
plotCombinedProfiles(tRNA_AA)
[1] "Q9NSE4" "P41252" "P54577" "P49589" "P49591" "P14868" "O43776" "P07814" "P47897" "P26639" "P26640" "P49588"
[13] "P41250"
#plotCombinedProfiles(para[[2]])
plotCombinedProfiles(motor_activity)
[1] "Q14203" "P33176" "O00159" "Q14204" "P35579" "P52732" "Q13409" "P35580" "P60660"
fvarLabels(rbp.prot)
[1] "Checked" "Confidence" "Sequence"
[4] "Modifications" "Qvality.PEP" "Qvality.q.value"
[7] "Number.of.Protein.Groups" "Number.of.Proteins" "Number.of.PSMs"
[10] "Master.Protein.Accessions" "Number.of.Missed.Cleavages" "Theo.MHplus.in.Da"
[13] "XCorr.Sequest.HT" "Confidence.Sequest.HT" "Percolator.q.Value.Sequest.HT"
[16] "Percolator.PEP.Sequest.HT" "master_protein" "protein_length"
[19] "protein_description" "peptide_start" "peptide_end"
[22] "crap_protein" "associated_crap_protein" "unique"
[25] "filename" "zero_imputation" "zero_imputation_n"
[28] "missing" "missing_n" "CV.F6"
[31] "CV.F7" "CV.F8" "CV.F9"
[34] "CV.F10" "CV.F11" "CV.F12"
[37] "CV.F13" "CV.F14" "CV.F15"
[40] "CV.F16" "CV.F17" "CV.F18"
[43] "CV.F19" "CV.F20" "markers"
[46] "oops" "xrnax" "go_rbp"
[49] "new_markers"
well_quantified_rbps <- fData(rbp.prot) %>%
filter(zero_imputation_n<=4, missing_n<=2) %>%
pull(master_protein)
plotCombinedProfiles(intersect(mrna_splicing, well_quantified_rbps))
[1] "Q01081" "Q00839" "P67809" "P11142" "O43290" "Q13247" "Q15366" "Q8IYB3"
ribosome_proteins <- names(markers_proteins)[markers_proteins=="RIBOSOME"]
plotCombinedProfiles(intersect(ribosome_proteins, well_quantified_rbps))
[1] "P15880" "P46781" "P35268" "P62750" "Q02543" "P36578" "P47914"
plotCombinedProfiles("Q15942")
[1] "Q15942"
p <- plotCombinedProfiles("P09874")
[1] "P09874"
ggsave("./Output/PARP1_profiles.png", p$p2)
Saving 7.29 x 4.5 in image